import pandas as pd
import yaml
from dagshub.streaming import install_hooks
def read_yaml(namefile='src/config.yaml'):
    """Load a YAML configuration file and return its contents as a dict.

    Parameters
    ----------
    namefile : str
        Path to the YAML config file (default: 'src/config.yaml').

    Returns
    -------
    dict
        Parsed configuration mapping.
    """
    # Context manager guarantees the handle is closed even if yaml.load
    # raises; the previous open/close pair leaked the file on error.
    with open(namefile, 'rb') as f:
        return yaml.load(f, Loader=yaml.FullLoader)
def clean_data(file_path):
    """Read the reviews CSV and return only rows with a 'Review Text' value.

    Parameters
    ----------
    file_path : str
        Path to the CSV file; its first column is used as the index.

    Returns
    -------
    pandas.DataFrame
        Reviews with missing 'Review Text' rows removed.
    """
    raw = pd.read_csv(file_path, index_col=0)
    # Rows without review text carry no signal for topic modeling.
    return raw.dropna(subset=['Review Text'])
# --- Data preparation ---
# Load paths from the YAML config (assumes it defines 'raw_data_path',
# 'all_path' and 'young_path' -- TODO confirm against src/config.yaml).
params = read_yaml()
# Drop reviews without text, then split off the younger customer segment.
df = clean_data(params['raw_data_path'])
df_young = df[df.Age<=36]
# Persist both the full and the age-filtered datasets.
df.to_csv(params['all_path'])
df_young.to_csv(params['young_path'])
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN
import nltk.stem
# [notebook output] /home/eugenia/topic-modeling-reviews/venv/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
# [notebook output]   from .autonotebook import tqdm as notebook_tqdm
# [notebook output] /home/eugenia/topic-modeling-reviews/venv/lib/python3.8/site-packages/requests/__init__.py:102: RequestsDependencyWarning: urllib3 (1.26.13) or chardet (5.1.0)/charset_normalizer (2.0.12) doesn't match a supported version!
# [notebook output]   warnings.warn("urllib3 ({}) or chardet ({})/charset_normalizer ({}) doesn't match a supported "
import pickle
from dagshub.streaming import install_hooks
# Snowball stemmer shared by the custom vectorizer below.
english_stemmer = nltk.stem.SnowballStemmer('english')


class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer stems every token with the Snowball stemmer."""

    def build_analyzer(self):
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            return [english_stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer
def train_bert(docs, model_path):
    """Fit a BERTopic model on the corpus, save it to disk and return it.

    Parameters
    ----------
    docs : list of str
        Review texts to model.
    model_path : str
        Destination path for the fitted model.

    Returns
    -------
    BERTopic
        The fitted topic model.
    """
    # Sentence-embedding backbone.
    sentence_encoder = SentenceTransformer("all-MiniLM-L6-v2")

    # Density-based clustering over the embeddings; prediction_data=True
    # keeps the data needed to assign topics to unseen documents later.
    clusterer = HDBSCAN(
        min_cluster_size=15,
        metric='euclidean',
        cluster_selection_method='eom',
        prediction_data=True,
    )

    # BM25-style class-TF-IDF weighting for the topic representations.
    topic_weighting = ClassTfidfTransformer(bm25_weighting=True)

    # Stemmed uni/bi-gram counts with English stop words removed.
    token_counter = StemmedCountVectorizer(
        analyzer="word", stop_words="english", ngram_range=(1, 2)
    )

    model = BERTopic(
        embedding_model=sentence_encoder,
        hdbscan_model=clusterer,
        ctfidf_model=topic_weighting,
        vectorizer_model=token_counter,
        language="english",
    )

    # Fit on the corpus, then persist the trained model.
    model.fit_transform(docs)
    model.save(model_path)
    return model
def load_bert(model_path):
    """Load a previously saved BERTopic model from `model_path`."""
    return BERTopic.load(model_path)
# Install DagsHub streaming hooks so files tracked in the remote repo can be
# opened as if they were local (local copy can be removed: rm -r data).
install_hooks(repo_url='https://dagshub.com/eugenia.anello/topic-modeling-reviews')
print('Load data from remote!')
# The with-statement closes the file itself; the explicit close() that
# followed was redundant and has been removed.
with open(params['young_path']) as pd_file:
    df_young = pd.read_csv(pd_file, index_col=0)
# [notebook output] Load data from remote!
# Extract the raw review texts as the modeling corpus.
docs = df_young['Review Text'].values.tolist()
print(docs[0])
# Cache the corpus so later stages can reuse it without re-reading the CSV.
# The with-statement closes the file; the explicit close() was redundant.
with open('docs.pkl', 'wb') as f:
    pickle.dump(docs, f)
# [notebook output] Absolutely wonderful - silky and sexy and comfortable
print('Start training!')
# Train a fresh model unless the config flags one as already on disk.
# ('not x' replaces the '== False' anti-idiom; assumes the config value
# is a boolean -- TODO confirm in src/config.yaml.)
if not params['model_already_trained']:
    topic_model = train_bert(docs, params['model_path'])
else:
    topic_model = load_bert(params['model_path'])
print('End training!')
# Sanity check: most frequent topics (-1 is BERTopic's outlier topic).
print(topic_model.get_topic_freq().head())
# [notebook output] Start training! End training! Topic Count 0 0 1901 1 -1 1573 2 1 598 3 2 438 4 3 404
# Topic-frequency table: one row per topic with its document count and name.
freq_df = topic_model.get_topic_info()
print("Number of topics: {}".format( len(freq_df)))
# Express each topic's share of all documents as a percentage.
freq_df['Percentage'] = round(freq_df['Count']/freq_df['Count'].sum() * 100,2)
# Reorder columns so Percentage appears right after Count.
freq_df = freq_df.iloc[:,[0,1,3,2]]
# A bare .head() only displays in a notebook; print it so the script shows it.
print(freq_df.head())
# [notebook output] Number of topics: 37
# [notebook output] | Topic | Count | Percentage | Name | |
# [notebook output] |---|---|---|---|---|
# [notebook output] | 0 | -1 | 1573 | 21.03 | -1_look_size_color_fit |
# [notebook output] | 1 | 0 | 1901 | 25.41 | 0_dress_size_fit_wear |
# [notebook output] | 2 | 1 | 598 | 7.99 | 1_love_look_great_like |
# [notebook output] | 3 | 2 | 438 | 5.85 | 2_shirt_love shirt_love_small |
# [notebook output] | 4 | 3 | 404 | 5.40 | 3_pant_jean_waist_stretch |
freq_df.tail()
# [notebook output] | Topic | Count | Percentage | Name | |
# [notebook output] |---|---|---|---|---|
# [notebook output] | 32 | 31 | 21 | 0.28 | 31_green_green color_dress green_color |
# [notebook output] | 33 | 32 | 20 | 0.27 | 32_pregnant_babi_matern_pregnanc |
# [notebook output] | 34 | 33 | 19 | 0.25 | 33_cami_need cami_need_cami underneath |
# [notebook output] | 35 | 34 | 18 | 0.24 | 34_sock_feet_stay_heel |
# [notebook output] | 36 | 35 | 15 | 0.20 | 35_print_tiger print_poodl_tiger |
freq_df['Count'].sum()
# [notebook output] 7481
# NOTE(review): recomputes 'Percentage' exactly as earlier in the file --
# a notebook re-run artifact; harmless but redundant. The bare .head()
# below only displays in a notebook.
freq_df['Percentage'] = round(freq_df['Count']/freq_df['Count'].sum() * 100,2)
freq_df.head()
# [notebook output] | Topic | Count | Name | Percentage | |
# [notebook output] |---|---|---|---|---|
# [notebook output] | 0 | -1 | 1573 | -1_look_size_color_fit | 21.03 |
# [notebook output] | 1 | 0 | 1901 | 0_dress_size_fit_wear | 25.41 |
# [notebook output] | 2 | 1 | 598 | 1_love_look_great_like | 7.99 |
# [notebook output] | 3 | 2 | 438 | 2_shirt_love shirt_love_small | 5.85 |
# [notebook output] | 4 | 3 | 404 | 3_pant_jean_waist_stretch | 5.40 |
def load_bert(model_path):
    """Load a saved BERTopic model from disk (re-declared notebook cell)."""
    return BERTopic.load(model_path)
# Reload the trained model and generate the standard BERTopic figures.
topic_model = load_bert(params['model_path'])
# Intertopic distance map.
fig1 = topic_model.visualize_topics()
fig1.show()
# Save topic-terms barcharts as HTML file
fig2 = topic_model.visualize_barchart(top_n_topics = 10)
fig2.show()
# Save documents projection as HTML file
fig3 = topic_model.visualize_documents(docs)
fig3.show()
# Save topics dendrogram as HTML file
fig4 = topic_model.visualize_hierarchy()
fig4.show()
# Topic-similarity heatmap, limited to 10 clusters.
fig5 = topic_model.visualize_heatmap(n_clusters=10, width=1000, height=1000)
fig5.show()
# Persist every figure as a standalone HTML file under output/.
fig1.write_html("output/intertopic_dist_map.html")
fig2.write_html("output/barchart.html")
fig3.write_html("output/projections.html")
fig4.write_html("output/hierarchy.html")
fig5.write_html("output/heatmap.html")
import os
import mlflow
from dagshub import dagshub_logger
# List everything currently in the output directory (figures plus any
# extra files such as docs.pkl or .gitignore).
l_html = os.listdir('output')
print(l_html)
# [notebook output] ['hierarchy.html', '.gitignore', 'barchart.html', 'intertopic_dist_map.html', 'docs.pkl', 'heatmap.html', 'projections.html']
# The directory listing can contain non-HTML files (e.g. docs.pkl,
# .gitignore); keep only the figure files instead of removing names one
# by one, which raised ValueError when a file was absent.
l_html = [name for name in l_html if name.endswith('.html')]
# Point MLflow at the remote tracking server and authenticate.
mlflow.set_tracking_uri(params['mlflow_url'])
os.environ['MLFLOW_TRACKING_USERNAME'] = params['MLFLOW_TRACKING_USERNAME']
os.environ['MLFLOW_TRACKING_PASSWORD'] = params['MLFLOW_TRACKING_PASSWORD']
# set_experiment creates the experiment if missing AND makes it the active
# one for start_run; create_experiment raised if it already existed and
# did not set the active experiment.
mlflow.set_experiment("topic_modeling")
with mlflow.start_run():
    with dagshub_logger() as logger:
        logger.log_hyperparams({"model_name": 'BERTopic'})
    # Attach every generated HTML figure to the run as an artifact.
    for html_path in l_html:
        mlflow.log_artifact('output/'+html_path)